Code
import polars as pl
import altair as alt
from read_parquet_and_reorder import read_parquet_and_reorder
from one_hot_encode import one_hot_encode
from blog import logger

# Lift Altair's row limit so the charts below can embed the full dataset.
alt.data_transformers.disable_max_rows()
DataTransformerRegistry.enable('default')
Code
# Show INFO-level messages so the logger.info calls in later cells are visible.
logger.setLevel("INFO")
Code
# Load the dataset and log its shape.
df = read_parquet_and_reorder("df.parquet")
logger.info(df.shape)

# Split the "*_100g" columns into their own frame (keyed by "code") and keep
# the remaining columns in df.
per_100g_columns = [name for name in df.columns if name.endswith("_100g")]
df_per_100g = df.select("code", *per_100g_columns)
df = df.select([name for name in df.columns if not name.endswith("_100g")])

# Tag-like columns that are one-hot encoded one at a time.
columns = [
    "categories_en",
    "ingredients_tags",
    "ingredients_analysis_tags",
    "traces_en",
    "food_groups_en",
    "nutrient_levels_tags",
    "main_category_en",
    "packaging_en",
]

# One encoded frame per column, plus the per-100g frame under "nutrients".
# NOTE(review): presumably n=10 keeps the ten most frequent values and
# remove_prefix strips language prefixes -- confirm in one_hot_encode.
df_dict: dict[str, pl.DataFrame] = {}
for column_name in columns:
    df_dict[column_name] = one_hot_encode(
        df, column_name, n=10, remove_prefix=["en:", "de:"]
    )
df_dict["nutrients"] = df_per_100g
[06/30/23 09:12:04] INFO     (73307, 175)                                                           2632950586.py:2
[06/30/23 09:12:05] INFO     (73307, 11)                                                       one_hot_encode.py:58
                    INFO     (73307, 11)                                                       one_hot_encode.py:58
                    INFO     (73307, 11)                                                       one_hot_encode.py:58
                    INFO     (73307, 11)                                                       one_hot_encode.py:58
                    INFO     (73307, 11)                                                       one_hot_encode.py:58
                    INFO     (73307, 11)                                                       one_hot_encode.py:58
                    INFO     (73307, 11)                                                       one_hot_encode.py:58
                    INFO     (73307, 11)                                                       one_hot_encode.py:58
Code
# Assemble the model input: start from the product codes and attach every
# feature frame in df_dict, matching rows on "code".
df_for_ml = df.select("code")
for key, _df in df_dict.items():
    logger.info(key)
    # Left join so every row of df is kept: the targets are later read from df
    # and paired with these features by position, so a (default) inner join
    # that drops codes missing from a feature frame would misalign them.
    # Clashing column names from the right frame get the frame's key appended.
    df_for_ml = df_for_ml.join(_df, on="code", how="left", suffix=key)
# Features without a value count as 0.
df_for_ml = df_for_ml.fill_null(0.0)
                    INFO     categories_en                                                           288844683.py:3
                    INFO     ingredients_tags                                                        288844683.py:3
                    INFO     ingredients_analysis_tags                                               288844683.py:3
                    INFO     traces_en                                                               288844683.py:3
                    INFO     food_groups_en                                                          288844683.py:3
                    INFO     nutrient_levels_tags                                                    288844683.py:3
                    INFO     main_category_en                                                        288844683.py:3
                    INFO     packaging_en                                                            288844683.py:3
                    INFO     nutrients                                                               288844683.py:3
Code
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
import collections
import numpy as np


# Target vector: the "nutriscore_score" column of df as a 1-D array.
target_frame = df.select("nutriscore_score")
y = target_frame.to_numpy().flatten()
logger.info(y.shape)

# Feature matrix: every assembled column except the "code" join key.
feature_frame = df_for_ml.drop("code")
X = feature_frame.to_numpy()
logger.info(X.shape)
[06/30/23 09:12:30] INFO     (73307,)                                                               2128009356.py:9
                    INFO     (73307, 198)                                                          2128009356.py:12
Code
# Normalize the features; Normalizer rescales each sample (row) on its own.
transformer = Normalizer().fit(X)
X_normalized = transformer.transform(X)

# NOTE(review): test_size=0.80 leaves only 20% of the rows for training --
# confirm this split is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X_normalized,
    y,
    test_size=0.80,
    random_state=2023,
)

# Depth-limited decision tree that treats each distinct score as a class.
clf = tree.DecisionTreeClassifier(max_depth=15).fit(X_train, y_train)

# One frame of actual vs. predicted scores per split (test first, then
# train), stacked and extended with the signed prediction error.
splits = (
    ("test", X_test, y_test),
    ("train", X_train, y_train),
)
prediction_frames = [
    pl.DataFrame(
        {
            "actual score": targets,
            "predicted score": clf.predict(features),
            "label": split_name,
        }
    )
    for split_name, features, targets in splits
]
signed_error = pl.col("predicted score") - pl.col("actual score")
df_tree = pl.concat(prediction_frames).with_columns(err=signed_error)

# Predicted vs. actual score, one facet per split; the low opacity keeps
# dense regions readable.
scatter = alt.Chart(df_tree).mark_point(filled=True, opacity=0.02)
chart = scatter.encode(
    x="actual score:Q",
    y="predicted score:Q",
    color="label:N",
    column="label:N",
).properties(width=300, height=300)

display(chart)
Code
# Show the distinct score values the tree predicted across both splits.
df_tree["predicted score"].unique()
shape: (45,)
predicted score
i64
-13
-12
-11
-10
-9
-8
-7
-6
-5
-4
-3
-2
20
21
22
23
24
25
26
27
28
29
30
35
Code
# Confusion-matrix style table: one row per actual score, one column per
# predicted score, each cell holding the number of samples (0 when the
# combination never occurs).
confusion = (
    df_tree.groupby("actual score", "predicted score")
    .count()
    .pivot(
        index="actual score",
        columns="predicted score",
        values="count",
        aggregate_function=None,
    )
    .fill_null(0)
    .sort("actual score")
)

# Order the predicted-score columns numerically. The names are taken from the
# pivot result itself, so every selected column is guaranteed to exist; a
# hard-coded score range filtered by a Series membership test can request a
# score that was never predicted and fail with ColumnNotFoundError.
predicted_columns = sorted(
    (name for name in confusion.columns if name != "actual score"),
    key=int,
)
confusion.select("actual score", *predicted_columns)
ColumnNotFoundError: 31

Error originated just after this operation:
DF ["actual score", "13", "-6", "3"]; PROJECT */46 COLUMNS; SELECTION: "None"